# Start from an empty workspace: remove every object in the current
# environment, hidden (dot-prefixed) ones included.
# NOTE(review): this also wipes the workspace of anyone who source()s this
# script; running it in a fresh R session would make this line unnecessary.
rm(list = ls(all.names = TRUE))

library(quanteda)

### PATHS ##############################
# Both paths are relative ("./"), so they resolve against the working
# directory of the R session that runs this script.
# NOTE(review): both end in "/" and are later combined with file.path(),
# which inserts its own separator, so the built paths contain "//".
# That is usually harmless, but worth confirming on the target platform.
# input directory (read from)
dataIn <- "./data/policy agendas codebook/"
# output directory (written to)
dataOut <- "./data/"
#######################################

# create a corpus from the cleaned policy agenda category descriptions
# (the "*.txt" pattern under dataIn is handed to textfile() as-is;
# presumably textfile() expands it to every matching file -- TODO confirm)
fname <- file.path(dataIn,"*.txt")
# docvarsfrom = "filenames" asks textfile() to derive document variables from
# the file names -- NOTE(review): the exact splitting rules depend on the
# installed quanteda version; confirm against its documentation
paSource <- textfile(fname, docvarsfrom = c("filenames"))
paCorpus <- corpus(paSource)

# create a dfm, removing stopwords, and using stemming
# ("e.g" and "i.e" are dropped as well; presumably they are spelled without
# the final period because that is how they come out of tokenization
# -- TODO confirm)
# NOTE(review): textfile(), ignoredFeatures and stem look like an older
# quanteda API; check whether the installed release still supports them
padfm <- dfm(paCorpus,
             ignoredFeatures=c("e.g", "i.e", stopwords("english")),
             stem=TRUE)

# transform the dfm to a dense matrix (one row per document, one column per
# feature of the dfm)
m <- as.matrix(padfm)

# reduce each row to its topic word list, i.e. the column names whose
# frequency in that row is > 0, joined into one space-separated string
a <- apply(m, 1, function(x) paste(names(x)[x > 0], collapse = " "))

# word counts for paper: number of words (freq in row > 0) in each topic's
# word list. The previous statement here repeated the word-list expression
# and therefore printed the lists again instead of any counts.
# print() is explicit so the counts also show when the script is source()d.
print(rowSums(m > 0))

# write the word lists to a txt file, one topic per line
fname <- file.path(dataOut, "policy_agenda_word_lists.txt")
writeLines(a, fname)

# labels: derived from the dfm's row (document) names by taking the piece
# between the first and second "-", removing ".txt", and lower-casing
docNames <- rownames(padfm)
nameParts <- strsplit(docNames, split = "-")

# Every name must contain at least one "-"; otherwise there is no second
# piece to use as the label. Fail with the offending names instead of an
# opaque "subscript out of bounds" error.
noDash <- lengths(nameParts) < 2L
if (any(noDash)) {
  stop("Document name(s) without a '-' separator: ",
       paste(docNames[noDash], collapse = ", "),
       call. = FALSE)
}

# vapply() guarantees a character vector even when there are no documents
lab <- vapply(nameParts, function(p) p[[2L]], character(1))
lab <- gsub("\\.txt", "", lab)
lab <- tolower(lab)

# write the labels to a txt file, one label per line
fname <- file.path(dataOut, "policy_agenda_topic_labels.txt")
write(lab, fname)


